"""
BProxy - an HTTP proxy that logs URI/title information
by Sean B. Palmer, 2002-10

based heavily on Mojo Nation's proxy: 
http://cvs.sf.net/cgi-bin/viewcvs.cgi/mojonation/evil/proxy/AsyncMojoProxy.py
with a few clean-ups

TODO: 
   * Only scan for a title when it's text/html
   * Enable on/off without shutting down the proxy
"""

import sys, os, os.path, re, time, urlparse, mimetools, BaseHTTPServer
import socket, asyncore, asynchat
from StringIO import StringIO

# Address the listening socket is bound to (see HTTPProxyServer.__init__).
# The loopback address keeps the proxy reachable from this machine only.
ADDR_TO_BIND_TO = '127.0.0.1'

def log(s): 
    """Write the diagnostic message s to stderr as one line.

    A newline is appended unless s already ends with one.
    """
    line = s
    if line[-1:] != '\n': 
        line = line + '\n'
    sys.stderr.write(line)

def note(s): 
    """Append the line s to today's log file.

    The file is created in the current working directory and named after
    the current UTC date (YYYY-MM-DD.log).  A newline is appended unless s
    already ends with one.
    """
    if not s.endswith('\n'): s += '\n'
    fn = time.strftime('%Y-%m-%d.log', time.gmtime(time.time()))
    # Close the file explicitly rather than leaving it to the garbage
    # collector: this runs for every logged request and title.
    f = open(fn, 'a')
    try: f.write(s)
    finally: f.close()

# Ad filtering is switched on only when both pattern files are present in
# the current working directory.
FILTER_ADS = os.path.exists('ad-hosts.txt') and os.path.exists('ad-paths.txt')

if FILTER_ADS: 
    def _ad_alternation(fn): 
        """Return the regex fragments listed one per line in the file fn,
        joined into a single '(a|b|...)' group."""
        f = open(fn)
        try: lines = f.read().splitlines()
        finally: f.close()
        # A blank line would turn into an empty alternative, which matches
        # every URI and so bans every request.  If nothing is left, use a
        # group that can never match instead of '()', which always does.
        patterns = [line for line in lines if line.strip()]
        if not patterns: return '(?!)'
        return '(%s)' % '|'.join(patterns)

    ad_hosts = _ad_alternation('ad-hosts.txt')
    ad_paths = _ad_alternation('ad-paths.txt')

    # Built once here instead of on every request.
    _ad_hosts_re = re.compile(ad_hosts + '$')
    _ad_paths_re = re.compile('^[^/]*' + ad_paths)

    def filterURI(host, path): 
        """Return a true value (the match object) when the request should
        be blocked, otherwise None.

        The host patterns must match at the end of host; the path patterns
        are matched against host+path, starting at the first '/'.
        """
        return (_ad_hosts_re.search(host) or 
                _ad_paths_re.search(host + path))

class MyAsynchat(asynchat.async_chat): 
    """async_chat base class for the proxy channels; it only replaces
    log_info so that messages go to stderr."""

    def log_info(self, message, type='info'): 
        """Write '<type>: <message>' to stderr.

        Messages of type 'info' are dropped when Python runs without
        __debug__ (i.e. with -O); every other type is always written.
        The parameter is called `type` because that is the keyword used
        by the method being overridden.
        """
        if type == 'info' and not __debug__: 
            return
        sys.stderr.write('%s: %s\n' % (type, message))

class HTTPProxySender(MyAsynchat): 
    """The proxy's connection to the target server for one request.

    Everything the server sends is relayed to the HTTPProxyReceiver that
    created this object.  On the way through, the data is scanned for an
    HTML <title>, which is written to the day's log with the request URL.
    """

    # Matches a <title> element, unless the text following it reaches a
    # '-->' without passing a '--' first, i.e. unless the title appears to
    # sit inside an HTML comment.
    title_re = re.compile(
        r'(?i)<title>([^<]+)</title>(?!(?:[^-]|-(?!-))*-->)')

    # How much already-scanned response text is kept, so that a title
    # split across two reads can still be matched.
    title_window = 4096

    def __init__(self, receiver, id, host, port): 
        """Create the socket and start connecting to (host, port).

        receiver is the HTTPProxyReceiver to relay to; id is its channel
        number and is only used to tag log lines.  If connect() raises
        straight away, the receiver is told through
        sender_connection_error() and this channel is closed.
        """
        asynchat.async_chat.__init__(self)
        self.receiver = receiver
        self.id = id
        # Response text still to be scanned for a title; None once a
        # title has been logged.
        self.data = ''
        self.set_terminator(None)
        self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
        self.host = host
        self.port = port
        try: self.connect((host, port))
        except socket.error, e:
            log('(%d) XXX %s' % (self.id, e))
            self.receiver.sender_connection_error(e)
            self.close()

    def handle_connect(self): 
        """Tell the receiver that the connection to the server is up."""
        log('(%d) S handle_connect' % self.id)
        try: self.receiver.sender_is_connected()
        except socket.error, e:
            log('(%d) OOO %s' % (self.id, e))
            if hasattr(self, 'receiver'):
                self.receiver.sender_connection_error(e)
            self.close()
            return
        log('(%d) sender connected' % self.id)

    def return_error(self, e):
        """Report the error e to the client as a 404 page, then close."""
        log('(%d) sender got socket error: %s' % (self.id, e))
        if (isinstance(e, socket.error) 
            and isinstance(e.args, tuple)
            and (len(e.args) == 2)): e = e.args[1] # get the error string only
        self.receiver.error(404, 
             'Error connecting to <em>%s</em> on port <em>%d</em>: <b>%s</b>' \
              % (self.host, self.port, e), response=str(e))
        self.close()

    def collect_incoming_data(self, data): 
        """Scan a chunk of the server's response and relay it unchanged."""
        log('<== (%d) %d bytes' % (self.id, len(data)))
        self.parse_data(data)
        self.receiver.push(data)

    def parse_data(self, data): 
        """Look for the page title in the response and log it once.

        data is the chunk just received.  It is appended to the text kept
        from earlier chunks before searching, so a title that straddles
        two reads is still found.  After a title has been written with
        note(), self.data is None and later chunks are ignored.
        """
        if self.data is None: return
        self.data += data

        found = self.title_re.search(self.data)
        if found is None: 
            # Keep a short tail only: enough for a split title, without
            # holding the whole response body in memory.
            self.data = self.data[-self.title_window:]
            return

        t = found.group(1).replace('"', '\\"').strip()
        t = re.sub('[\t\r\n\f]', ' ', t)
        note('   %s "%s"' % (self.receiver.url, t))
        self.data = None

    def handle_close(self):
        """The server closed the connection: let the receiver finish
        sending what is queued, then close this channel."""
        log('(%d) sender closing' % self.id)
        # The receiver may already have been detached by an earlier call.
        if hasattr(self, 'receiver'):
            self.receiver.close_when_done()
            del self.receiver # break circular reference
        self.close()

class HTTPProxyReceiver(MyAsynchat): 
    channel_counter = [0]

    def __init__(self, server, (conn, addr)):
        self.id = self.channel_counter[0] # used during log calls
        try: self.channel_counter[0] += 1
        except OverflowError: 
            self.channel_counter[0] = 0
        asynchat.async_chat.__init__(self, conn)
        self.set_terminator('\n')
        self.server = server
        self.buffer = StringIO()

        # in the beginning there was GET...
        self.found_terminator = self.read_http_request
    
    def collect_incoming_data(self, data): 
        self.buffer.write(data)
    
    def push_incoming_data_to_sender(self, data): 
        # e.g. when using POST or PUT
        log('==> (%d) %d bytes' % (self.id, len(data)))
        self.sender.push(data)

    def read_http_request(self):
        request = self.buffer.getvalue()
        self.buffer = StringIO()

        log('%s - %s' % (time.ctime(time.time()), request))

        # client-originated shutdown hack: 
        if request.strip() == 'quit':
            log('External quit command received.')
            raise asyncore.ExitNow

        try: 
            self.method, self.url, self.protocol = request.split()
            self.method = self.method.upper()
        except: self.error(400, "Can't parse request")

        if not self.url: self.error(400, "Empty URL")
        else: 
            timen = time.strftime('%H:%M:%S', time.gmtime(time.time()))
            note('%s %s' % (timen, self.url))

        if self.method not in ['CONNECT', 'GET', 'HEAD', 'POST', 'PUT']: 
            self.error(501, "Unknown request method (%s)" % self.method)
        if self.method == 'CONNECT': 
            self.netloc = self.url
            self.scheme = 'https'
            self.path = ''
            params, query, fragment = '', '', ''
        else:
            # split url into site and path
            (self.scheme, self.netloc, self.path, 
             params, query, fragment) = urlparse.urlparse(self.url)
            if self.scheme.lower() not in ('http', ''): 
                self.error(501, "Unknown request scheme (%s)" % self.scheme)

        # find port number
        if ':' in self.netloc:
            self.host, self.port = self.netloc.split(':')
            self.port = int(self.port)
        else:
            self.host = self.netloc
            if self.method == 'CONNECT': self.port = 443  # default SSL port
            else: self.port = 80

        # now we have the url and host

        if FILTER_ADS: 
            if filterURI(self.host, self.path): 
                self.error(404, "Not found: banned")
                del self.initiate_send # gives a big error, not many small ones

        self.original_host_and_port = None
        self.path = urlparse.urlunparse(('', '', 
                                         self.path, params, query, fragment))

        # now we have the url, host, and path

        if (self.host == '') and self.path.startswith('/'): 
            path = self.path.lstrip('/')
            if path == 'off': PROXY_STATE = 0
            elif path == 'on': PROXY_STATE = 1
            elif path == 'refererOn': REFERER_STATE = 1
            elif path == 'refererOff': REFERER_STATE = 0

        # a "file" to read the headers into for mimetools.Message
        self.rawheaders = StringIO()
        self.found_terminator = self.read_http_headers

    def read_http_headers(self):
        header = self.buffer.getvalue()
        self.buffer = StringIO()
        if header and header[0] != '\r': 
            self.rawheaders.write(header)
            self.rawheaders.write('\n')
        else:
            # all headers have been read, process them
            self.rawheaders.seek(0)
            self.mimeheaders = mimetools.Message(self.rawheaders)

            if ((self.method == 'POST' or self.method == 'PUT') 
                and not self.mimeheaders.has_key('content-length')): 
                self.error(400, "Missing Content-Length "
                                "for %s method" % self.method)
            self.length = int(self.mimeheaders.get('content-length', 0))
            del self.mimeheaders['accept-encoding']
            del self.mimeheaders['proxy-connection']

            # put in whatever User-Agent here
            ua = 'Mozilla/4.0 '
            if self.host.endswith('microsoft.com'): 
                ua += '(compatible; MSIE 5.0; Windows ME) Opera 6.01 [en]\r'
            else: 
                ua += '(compatible; MSIE 6.0; Windows 98; Win 9x 4.90)\r'
            self.mimeheaders['User-Agent'] = ua

            # strip off referer from urls we don't want referer headers on
            # referer = self.mimeheaders.get('referer', 0)
            # if (referer and 
            # referer_to_strip_re.search(self.mimeheaders['referer'])):
                # IMHO, we should -always- do this but unfortunately some
                # stupid web sites probably depend on it.  -greg
            #     del self.mimeheaders['referer']

            if self.port == 80: self.mimeheaders['Host'] = self.host
            else: # @@ is the host header ever supposed to have the port?
                self.mimeheaders['Host'] = '%s:%s' % (self.host, self.port)
            self.mimeheaders['Host'] += '\r' # some sites break without this

            self.sender = HTTPProxySender(self, self.id, self.host, self.port)
            self.push_request_to_sender()
    
    def push_request_to_sender(self): 
        headers = ''.join(self.mimeheaders.headers)
        request = '%s %s HTTP/1.0\r\n%s\r\n' % (self.method, self.path, headers)

        if self.original_host_and_port:
            log('(%d) sending req. (original_host_and_port):' % self.id)
        else: log('(%d) sending request to server:' % self.id)
        log(`request`)

        self.sender.push(request)
        self.set_terminator(None)
        self.buffer = StringIO()
    
    def sender_is_connected(self):
        """The sender calls this to tell us when it is ready for more data."""
        log('(%d) R sender_is_connected()' % self.id)
        # sender gave us the OK, give it our buffered data and any future data
        self.push_incoming_data_to_sender(self.buffer.getvalue())
        self.buffer = None
        self.collect_incoming_data = self.push_incoming_data_to_sender
    
    def sender_connection_error(self, e):
        log('(%d) R sender_connection_error(%s) for %s:%s\n' % (self.id, e, 
                                                        self.host, self.port))
        # if this was a redirected request and the redirection failed...
        if self.original_host_and_port:
            self.sender = HTTPProxySender(self, self.id, self.host, self.port)
            self.push_request_to_sender()
            return
        if (isinstance(e, socket.error) 
            and type(e.args) == type(()) 
            and len(e.args) == 2): e = e.args[1]  # get the error string only
        self.error(404, 'Error connecting to <em>%s</em> on port '
          '<em>%d</em>: <b>%s</b>' % (self.host, self.port, e), response=str(e))

    def handle_close(self): 
        log('(%d) receiver closing' % self.id)
        if hasattr(self, 'sender'): 
            # self.sender.close() should be fine except for PUT requests?
            self.sender.close_when_done()
            del self.sender # break circular reference
        self.close()
    
    def show_error(self, code, body, response=None): 
        if not response:
            response = BaseHTTPServer.BaseHTTPRequestHandler.responses[code][0]
        self.push("HTTP/1.0 %s %s\r\n" % (code, response))
        self.push("Server: B-Proxy\r\n")
        self.push("Content-type: text/html\r\n")
        self.push("\r\n")
        self.push('<html>\n<head>\n<title>%d %s</title>\n</head>\n'
                  '<body>\n%s\n</body>\n</html>' % (code, response, body))

    def error(self, code, body, response=None):
        self.show_error(code, body, response=response)
        if hasattr(self, 'sender'):
            self.sender.handle_close()
            del self.sender  # break circular reference
        self.close()

class HTTPProxyServer(asyncore.dispatcher): 
    """Listening socket of the proxy: creates one HTTPProxyReceiver for
    every connection it accepts."""

    def __init__(self, port):
        """Bind to ADDR_TO_BIND_TO on the given port and start listening."""
        asyncore.dispatcher.__init__(self)
        self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
        self.set_reuse_addr()
        self.ouraddr = (ADDR_TO_BIND_TO, port)
        log('Starting proxy on %s port %d' % self.ouraddr)
        self.bind(self.ouraddr)
        self.listen(5)
        self.rs = [] # not used anywhere in this file

    def handle_accept(self): 
        """Hand a newly accepted connection to an HTTPProxyReceiver."""
        # Newer asyncore versions return None from accept() when the
        # pending connection has already gone away; unpacking that in
        # HTTPProxyReceiver.__init__ would raise a TypeError.
        pair = self.accept()
        if pair is None: return
        HTTPProxyReceiver(self, pair)

    def log_info(self, message, type='info'): 
        """Write '<type>: <message>' to stderr; 'info' messages are
        dropped when Python runs without __debug__."""
        if __debug__ or type != 'info': 
            sys.stderr.write('%s: %s\n' % (type, message))

if __name__ == '__main__':
    # usage: [port]   (default 8000)
    if len(sys.argv) >= 2: PORT = int(sys.argv[1])
    else: PORT = 8000

    # Ask a proxy that is already listening on our port to shut itself
    # down (see the 'quit' hack in HTTPProxyReceiver.read_http_request),
    # so that the port is free for this instance.
    log('Stopping external proxies:')
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try: 
        try: 
            s.connect(('localhost', PORT))
            s.send('quit\r\n')
        finally: s.close()
    # only network failures are expected here; do not swallow Ctrl-C
    except socket.error: 
        log('Could not connect to localhost %d, oh well...' % PORT)

    ps = HTTPProxyServer(PORT)
    log('Starting service...')
    asyncore.loop()

# [EOF]